Goal: Do a PCA plot using Irene’s SNPs to look for relatedness among plants. Also figure out loading (which genes contribute most)
First, need to call SNPs using all BAM files at once:
working in directory 2019/IreneSnps on whitney
first, sort bams
for f in SNPanalysis/*LT*rmdup.bam
do
newname=`basename $f .bam`_sort.bam
samtools sort -o $newname --reference SNPanalysis/Pinsaporeference1 $f
done
next, assign read groups to bams
input=""
for f in `ls *sort.bam`
do
rg=`basename $f _rmdup_sort.bam`
input="$input -b $f -r $rg -s $rg"
done
echo $input
bamaddrg $input > LT_rmdup_sort_combined.bam
samtools index LT_rmdup_sort_combined.bam
freebayes -f SNPanalysis/Pinsaporeference1 --no-indels --no-mnps --no-complex LT_rmdup_sort_combined.bam > LT.vcf &
try parallel
ulimit -n 4000
/usr/local/stow/freebayes/scripts/fasta_generate_regions.py Pinsaporeference1.fai 100000 > regions
./freebayes-parallel regions 8 -f Pinsaporeference1 --no-indels --no-mnps --no-complex LT_rmdup_sort_combined.bam > LT.vcf
(note: I edited the freebayes-parallel script so that it would work…)
Freeybayes parallel takes about 12 hours
scp whitney.plb.ucdavis.edu:2019/IreneSnps/LT.vcf.gz ../input/
library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
[30m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.2.1 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtibble [30m 2.1.3 [32m✔[30m [34mdplyr [30m 0.8.3
[32m✔[30m [34mtidyr [30m 1.0.0 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(ggrepel)
get the vcf header
vcf.header <- system("zgrep '#C' ../input/LT.vcf.gz",intern = TRUE)
vcf.header
[1] "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t38LTR\t42LTR\t42LTRR\t43LTR\t43LTRR\t49LTWR\t49LTWRR\t95LTWR\t95LTWRR\t99LTWR"
vcf.header <- vcf.header %>%
str_replace("#","") %>% #get rid of the pound sign
str_split(pattern = "\t") %>% #split on the tabs
magrittr::extract2(1)
vcf.header
[1] "CHROM" "POS" "ID" "REF" "ALT" "QUAL" "FILTER" "INFO" "FORMAT" "38LTR" "42LTR" "42LTRR" "43LTR" "43LTRR" "49LTWR" "49LTWRR"
[17] "95LTWR" "95LTWRR" "99LTWR"
get the data
snps <- read_tsv("../input/LT.vcf.gz", na = c("","NA","."),comment="#",col_names = vcf.header) %>%
select(-ID, -FILTER) # these are empty columns
restarting interrupted promise evaluationParsed with column specification:
cols(
CHROM = [31mcol_character()[39m,
POS = [32mcol_double()[39m,
ID = [33mcol_logical()[39m,
REF = [31mcol_character()[39m,
ALT = [31mcol_character()[39m,
QUAL = [32mcol_double()[39m,
FILTER = [33mcol_logical()[39m,
INFO = [31mcol_character()[39m,
FORMAT = [31mcol_character()[39m,
`38LTR` = [31mcol_character()[39m,
`42LTR` = [31mcol_character()[39m,
`42LTRR` = [31mcol_character()[39m,
`43LTR` = [31mcol_character()[39m,
`43LTRR` = [31mcol_character()[39m,
`49LTWR` = [31mcol_character()[39m,
`49LTWRR` = [31mcol_character()[39m,
`95LTWR` = [31mcol_character()[39m,
`95LTWRR` = [31mcol_character()[39m,
`99LTWR` = [31mcol_character()[39m
)
snps
filter to keep snps where there is data from all samples
snps <- snps %>%
filter({select(., matches("[0-9]")) %>% complete.cases() })
snps
snps <- snps %>%
mutate(TOTAL_DEPTH= {str_extract(INFO, "DP=[0-9]*") %>%
str_remove("DP=") %>%
as.numeric() }
) %>%
filter(QUAL >=100,
nchar(ALT)==1,
TOTAL_DEPTH > quantile(TOTAL_DEPTH, 0.05),
TOTAL_DEPTH < quantile(TOTAL_DEPTH, 0.95))
snps
unpack the information differnet samples:
samples <- colnames(snps) %>% str_subset("^[0-9]")
for (s in samples) {
snps <- snps %>%
separate(!!s, into=paste(s,c("gt","tot.depth","allele.depth","ref.depth","ref.qual","alt.depth","alt.qual","gt.lik"),sep="_"),
sep=":", convert = TRUE)
}
snps
For the PCA we only need the genotype info
gts <- snps %>%
select(CHROM, POS, ends_with("_gt"))
gts
remove the 38LTR gample
gts <- gts %>% select(-`38LTR_gt`)
convert this to numeric
geno.numeric <- gts %>%
select(-CHROM, -POS) %>%
lapply(factor) %>% # convert charcters to "factors", where each category is internally represented as a number
as.data.frame() %>% # reformat
data.matrix() %>%# convert to numeric
t()
colnames(geno.numeric) <- str_c(gts$CHROM, "_", gts$POS)
head(geno.numeric[,1:5],10)
GCZN01000007.1_2815 GCZN01000007.1_2834 GCZN01000007.1_2865 GCZN01000007.1_2881 GCZN01000007.1_2970
X42LTR_gt 2 3 2 2 2
X42LTRR_gt 2 2 2 2 1
X43LTR_gt 2 2 2 2 2
X43LTRR_gt 2 2 2 2 2
X49LTWR_gt 2 2 2 2 2
X49LTWRR_gt 3 3 3 3 1
X95LTWR_gt 2 3 3 2 2
X95LTWRR_gt 2 3 3 2 2
X99LTWR_gt 2 2 2 2 2
dim(geno.numeric)
[1] 9 134421
dim(gts)
[1] 134421 11
get the principal components
pca <- prcomp(geno.numeric)
summary(pca)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 78.9711 75.4129 60.8894 54.9581 52.3780 45.30719 43.88667 42.97555 1.288e-11
Proportion of Variance 0.2291 0.2089 0.1362 0.1110 0.1008 0.07541 0.07076 0.06785 0.000e+00
Cumulative Proportion 0.2291 0.4380 0.5742 0.6852 0.7860 0.86139 0.93215 1.00000 1.000e+00
plot it
plot.data <- pca$x %>%
as.data.frame %>%
rownames_to_column("sample") %>%
mutate(response=str_extract(sample, "(LTR|LTWR)")) %>%
gather(key="component", value="value",PC2:PC9)
plots <- map(sort(unique(plot.data$component)), function(x) {
plot.data %>%
filter(component==x) %>%
ggplot(aes(x=PC1, y= value, label=sample, color=response)) +
geom_point() + ylab(x)
}
)
plots
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
loadings <- as.data.frame(pca$rotation) %>%
rownames_to_column("snp") %>%
select(PC1, snp) %>%
arrange(desc(abs(PC1)))
loadings
contributions <- loadings %>%
separate(snp, into=c("contig", "pos"), sep="_") %>%
group_by(contig) %>%
summarize(abs.contribution = abs(sum(PC1)),
contribution = sum(PC1),
number_of_snps = n()) %>%
arrange(desc(abs.contribution))
contributions
bring in seq lengths
lengths <- read_csv("../input/Pinsaporeference1_lengths.csv", col_names = c("contig", "length"), skip=1) %>%
mutate(contig = str_remove(contig, " .*"))
Parsed with column specification:
cols(
contig = [31mcol_character()[39m,
length = [32mcol_double()[39m
)
lengths
contributions <- contributions %>%
left_join(lengths) %>%
mutate(snps_per_100bp = round(number_of_snps / length * 100, 2)) %>%
select(contig, contribution, length, number_of_snps, snps_per_100bp)
Joining, by = "contig"
contributions
write_csv(contributions, "../output/gene_contributions.csv.gz")
pc1pc2 <- pca$x %>%
as.data.frame() %>%
rownames_to_column("ID") %>%
select(ID, PC1, PC2) %>%
mutate(ID={str_replace(ID, "W", "N") %>%
str_replace("RR", "R2") %>%
str_remove_all("(X|_gt)") },
response=ifelse(str_detect(ID,"N"), "no recovery", "recovery"))
pc1pc2
pc1pc2 %>%
ggplot(aes(x=PC1, y = PC2, label=ID, color=response)) +
geom_point() +
geom_text_repel(show.legend=FALSE, direction="y")
ggsave("../output/PCA.pdf")
Saving 7.29 x 4.51 in image
Create a list of GCZN01054158.1 SNPs
GCZN01054158.1.loadings <- loadings %>%
filter(str_detect(snp, fixed("GCZN01054158.1"))) %>%
separate(snp,into = c("contig", "position"), sep="_",convert = TRUE) %>%
arrange(position)
GCZN01054158.1.loadings
GCZN01054158.snpinfo <- left_join(GCZN01054158.1.loadings, snps, by=c("contig" = "CHROM", "position" = "POS"))
GCZN01054158.snpinfo
write_csv(GCZN01054158.snpinfo, "../output/GCZN01054158.snpinfo.csv")
Create a list of SNPs in all genes with fixed differences
get_snps <- function(contig, loadings=loadings) {
loadings %>%
filter(str_detect(snp, fixed("GCZN01054158.1"))) %>%
separate(snp,into = c("contig", "position"), sep="_",convert = TRUE) %>%
arrange(position)
GCZN01054158.1.loadings
GCZN01054158.snpinfo <- left_join(GCZN01054158.1.loadings, snps, by=c("contig" = "CHROM", "position" = "POS"))
GCZN01054158.snpinfo
}
fixed_genes_snps <- loadings %>%
separate(snp,into = c("contig", "position"), sep="_",convert = TRUE, remove = FALSE) %>%
semi_join(fixed_genes, by="contig") %>%
left_join(snps, by=c("contig" = "CHROM", "position" = "POS")) %>%
arrange(contig, position)
write_csv(fixed_genes_snps, "../output/fixed_genes_snps.csv")